{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import random\n",
    "\n",
    "punctuation = '!()&%{}[];:\\'\",./?\\\\<>'\n",
    "\n",
    "def remove_punc(string):\n",
    "    string = re.sub('[^A-Za-z0-9 ]+', ' ', string)\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string\n",
    "\n",
    "def remove_punc_random(string, threshold = 0.3):\n",
    "    result = []\n",
    "    for c in string:\n",
    "        if c in punctuation and random.random() >= threshold:\n",
    "            continue\n",
    "            \n",
    "        result.append(c)\n",
    "    return ''.join(result)\n",
    "\n",
    "def add_punc_random(string, threshold = 0.7):\n",
    "    string = string.split()\n",
    "    for i in range(len(string)):\n",
    "        if random.random() >= threshold and string[i][-1] not in punctuation:\n",
    "            string[i] = string[i] + random.choice(punctuation)\n",
    "    return ' '.join(string)\n",
    "\n",
    "def add_space_after_punc(string, threshold = 0.3):\n",
    "    string = string.split()\n",
    "    for i in range(len(string)):\n",
    "        if random.random() >= threshold and string[i][-1] in punctuation:\n",
    "            string[i] = string[i][:-1] + ' ' + string[i][-1]\n",
    "    return ' '.join(string)\n",
    "\n",
    "def replace_random_punc(string):\n",
    "    string = string.split()\n",
    "    for i in range(len(string)):\n",
    "        if string[i][-1] in punctuation:\n",
    "            string[i] = string[i][:-1] + random.choice(punctuation)\n",
    "    return ' '.join(string)\n",
    "\n",
    "def random_upper(string, threshold = 0.5):\n",
    "    string = string.split()\n",
    "    for i in range(len(string)):\n",
    "        if random.random() >= threshold:\n",
    "            string[i] = string[i].upper()\n",
    "    return ' '.join(string)\n",
    "\n",
    "def random_title(string, threshold = 0.5):\n",
    "    string = string.split()\n",
    "    for i in range(len(string)):\n",
    "        if random.random() >= threshold:\n",
    "            string[i] = string[i].title()\n",
    "    return ' '.join(string)\n",
    "\n",
    "def random_char_upper(string, threshold = 0.7):\n",
    "    result = []\n",
    "    for c in string:\n",
    "        if random.random() >= threshold:\n",
    "            c = c.upper()\n",
    "        result.append(c)\n",
    "    return ''.join(result)\n",
    "\n",
    "funcs = [remove_punc, remove_punc_random, add_punc_random, add_space_after_punc, \n",
    "         random_upper, random_title, random_char_upper, replace_random_punc]\n",
    "\n",
    "chain_funcs = [remove_punc_random, add_punc_random, add_space_after_punc, \n",
    "         random_upper, random_title, random_char_upper, replace_random_punc]\n",
    "\n",
    "def package(string, repeat = 2, repeat_chain = 5, threshold = 0.5):\n",
    "    \n",
    "    result = [string]\n",
    "    result.append(string.lower())\n",
    "    result.append(string.upper())\n",
    "    result.append(string.title())\n",
    "    \n",
    "    for _ in range(repeat):\n",
    "    \n",
    "        for func in funcs:\n",
    "            result.append(func(string))\n",
    "\n",
    "        for func in funcs:\n",
    "            result.append(func(string.lower()))\n",
    "\n",
    "        for func in funcs:\n",
    "            result.append(func(string.upper()))\n",
    "\n",
    "        for func in funcs:\n",
    "            result.append(func(string.title()))\n",
    "    \n",
    "    for _ in range(repeat_chain):\n",
    "        s = string[:]\n",
    "        for func in chain_funcs:\n",
    "            if random.random() > threshold:\n",
    "                s = func(s)\n",
    "        result.append(s)\n",
    "        \n",
    "    result = list(set(result))\n",
    "        \n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "def loop(strings):\n",
    "    results = []\n",
    "    for i in tqdm(range(len(strings))):\n",
    "        p = package(strings[i])\n",
    "        for row in p:\n",
    "            results.append((row, strings[i]))\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cleaning\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['../pure-text/filtered-dumping-wiki.txt',\n",
    "        '../pure-text/dumping-cleaned-news.txt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1992725"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open(files[0]) as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "data = [i for i in data if len(i) >= 15]\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = random.sample(data, 700000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 43750/43750 [00:36<00:00, 1208.31it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1200.26it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1208.62it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1199.90it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1197.28it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1200.30it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1193.82it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1198.89it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1189.47it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1193.01it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1191.08it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1188.14it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1183.98it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1184.67it/s]\n",
      "100%|██████████| 43750/43750 [00:36<00:00, 1188.06it/s]\n",
      "100%|██████████| 43750/43750 [00:37<00:00, 1178.91it/s]\n"
     ]
    }
   ],
   "source": [
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37908415"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('results-wiki-single.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(files[1]) as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3483907"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "del results1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = random.sample(data, 700000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 99%|█████████▉| 43351/43750 [00:42<00:00, 1044.61it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1026.36it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1025.76it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1028.30it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1027.74it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1022.46it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1023.04it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1020.42it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1022.49it/s]\n",
      " 99%|█████████▉| 43286/43750 [00:42<00:00, 1029.86it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1019.28it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1026.34it/s]\n",
      "100%|██████████| 43750/43750 [00:43<00:00, 1016.05it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1021.46it/s]\n",
      "100%|██████████| 43750/43750 [00:42<00:00, 1020.75it/s]\n",
      "100%|██████████| 43750/43750 [00:43<00:00, 1015.97it/s]\n"
     ]
    }
   ],
   "source": [
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38280627"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('results-news-single.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
